rm(list=ls())

.generate_lollipop <- function(n){
 t(sapply(1:n, function(i){
  # generate "cluster" assignment
  k <- sample(c(1,2), 1)
  
  # generate "ball"
  if(k == 1){
   x <- stats::rnorm(1, mean = 0, sd = 1)
   y <- stats::rnorm(1, mean = 0, sd = 1)
  } else {
   # k == 2, generate "stick"
   x <- stats::runif(1, min = -1, max = 5)
   y <- x + stats::rnorm(1, mean = 0, sd = 0.5)
  }
  
  c(x,y)
 }))
}

.generate_v <- function(n){
 t(sapply(1:n, function(i){
  # generate "cluster" assignment
  k <- sample(c(1,2), 1, prob = c(0.7, 0.3))
  
  # generate "stick"
  if(k == 1){
   x <- stats::runif(1, min = 0, max = 5)
   y <- 2*x + stats::rnorm(1, mean = 0, sd = 0.5)
  } else {
   # k == 2, generate "stick"
   x <-  stats::runif(1, min = 0, max = 5)
   y <- 0.5*x + stats::rnorm(1, mean = 0, sd = 0.1)
  }
  
  c(x,y)
 }))
}

.generate_local <- function(n){
  t(sapply(1:n, function(i){
    k <- sample(1:3, 1, prob = c(0.4, 0.4, 0.2))
    
    # generate "left_ball"
    if (k == 1){
      x <- stats::rnorm(1, mean = -3, sd = 0.5)
      y <- stats::rnorm(1, mean = 0, sd = 0.2)
    } else if (k == 2) {
      # generate "right_ball"
      x <- stats::rnorm(1, mean = 3, sd = 0.3)
      y <- stats::rnorm(1, mean = 0, sd = 0.2)
    } else{
      # generate "local correlated data"
      x <- stats::runif(1, min = -1.5, max = 1.5)
      y <- 0.5*x + stats::rnorm(1, mean = 0.1, sd = 0.1) 
    }
    c(x,y)
  }))
}

.generate_quadratic <- function(n){
  t(sapply(1:n, function(i){
      x <- stats::runif(1, min = -2, max = 2)
      y <- x^2 + stats::rnorm(1, mean = 0, sd = 0.25) 
    c(x,y)
  }))
}

.generate_clusters_L <- function(n){
  t(sapply(1:n, function(i){
    k <- sample(1:4, 1, prob = rep(1/4, 4))
    
    if (k == 1){
      x <- stats::rnorm(1, mean = -1, sd = 0.1)
      y <- x + stats::rnorm(1, mean = 1, sd = 1)
    } else if (k == 2){
      x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = -2, sd = 0.1)
    } else if (k == 3){
      x <- stats::rnorm(1, mean = -1, sd = 0.1)
      y <- x + stats::rnorm(1, mean = 0.5, sd = 0.5)
    } else {
      x <- stats::rnorm(1, mean = -.5, sd = 0.2)
      y <- stats::rnorm(1, mean = -2, sd = 0.1)
    }
    
    c(x,y)
  }))
}

.generate_clusters1 <- function(n){
  t(sapply(1:n, function(i){
    k <- sample(1:4, 1, prob = seq(0.25, 1, by = 0.25))
    
    if (k == 1){
      x <- stats::runif(1, min = -2.5, max = -0.5)
      y <- -0.5 * x + stats::rnorm(1, mean = 0.2, sd = 0.2) 
    } else if (k == 2){
      x <- stats::rnorm(1, mean = -3, sd = 0.5)
      y <- stats::rnorm(1, mean = -2, sd = 0.3)
    } else if (k == 3){
      x <- stats::rnorm(1, mean = 0, sd = 0.5)
      y <- x + -1 * stats::rnorm(1, mean = 1, sd = 0.5)
    } else{
      x <- stats::rnorm(1, mean = 2, sd = 0.5)
      y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
    }
    
    c(x,y)
  }))
}

.generate_clusters2 <- function(n){
  t(sapply(1:n, function(i){
    k <- sample(1:5, 1, prob = rep(1/5, 5))
    if (k == 1) {
      x <- stats::runif(1, min = -0.5, max = 2)
      y <- -0.5 * x + stats::rnorm(1, mean = 1.25, sd = 0.1)
    } else if (k == 2) {
      x <- stats::rnorm(1, mean = -0.25, sd = 0.1)
      y <- stats::rnorm(1, mean = 0, sd = 0.1)
    } else if (k == 3) {
      x <- stats::rnorm(1, mean = 1, sd = 0.75)
      y <- stats::rnorm(1, mean = 0, sd = 0.1)
    } else if (k == 4) {
      x <- stats::runif(1, min = -0.5, max = 0.25)
      y <- x + stats::rnorm(1, mean = 1.25, sd = 0.25)
    } else {
      x <- stats::rnorm(1, mean = 0.25, sd = 0.25)
      y <- stats::rnorm(1, mean = 0.5, sd = 0.1)
    }
    c(x,y)
  }))
}

plot(.generate_clusters_L(500))

plot(.generate_clusters2(500))

generate_data <- function(n, p){
 stopifnot(p %% 2 == 0)
 
 # generate p/2 2-dimensional datasets
 dat_list <- lapply(1:(p/2), function(round){
  type <- sample(1:7, 1)
  
  if(type == 1){
   .generate_lollipop(n)
  } else if (type == 2) {
   .generate_v(n)
  } else if (type == 3) {
   .generate_clusters_L(n)
  } else if (type == 4) {
   .generate_local(n)
  } else if (type == 5) {
   .generate_quadratic(n)
  } else if (type == 6) {
   .generate_clusters1(n)
  } else {
   .generate_clusters2(n)
  }
 })
 
dat_list <- lapply(dat_list, function(dat){
   dat[order(dat[,1]),]  
 })
 # concatenate column-wise
 do.call(cbind, dat_list)
}

set.seed(10)
dat <- generate_data(500, 10)
pairs(dat)

Method 1.A.I (a “randomized” ordering somewhat)

shuffling_function <- function(vec, window = 5){
  order_vec <- order(vec)
  
  for(i in 1:(length(order_vec)-window)){
    order_vec[i:(i+window)] <- sample(order_vec[i:(i+window)])
  }
  
  order_vec
}

dat1_A <- dat
new_order1 <- shuffling_function(dat1_A[,1], window = 10)
new_order3 <- shuffling_function(dat1_A[,3], window = 10)
new_order5 <- shuffling_function(dat1_A[,5], window = 10)
new_order7 <- shuffling_function(dat1_A[,7], window = 10)
new_order9 <- shuffling_function(dat1_A[,9], window = 10)

dat1_A_1 <- dat1_A
dat1_A_1[,c(1,2)] <- dat1_A[new_order1, c(1,2)]

dat1_A_2 <- dat1_A_1
dat1_A_2[,c(3,4)] <- dat1_A_1[new_order3, c(3,4)]

dat1_A_3 <- dat1_A_2
dat1_A_3[,c(5,6)] <- dat1_A_2[new_order5, c(5,6)]

dat1_A_4 <- dat1_A_3
dat1_A_4[,c(7,8)] <- dat1_A_3[new_order7, c(7,8)]

dat1_A_5 <- dat1_A_4
dat1_A_5[,c(9,10)] <- dat1_A_4[new_order9, c(9,10)]

for (elem in list(dat1_A, dat1_A_1, dat1_A_2, dat1_A_3, dat1_A_4, dat1_A_5)){
  pairs(elem)
}

Method 1.A.II

quadratic_ordering <- function(vec){
  order_vec <- order(vec)
  
  n <- length(order_vec)
  odd_idx <- seq(1, n, by = 2)
  even_idx <- seq(2, n, by = 2)
  
  new_order_vec <- c(order_vec[odd_idx], rev(order_vec[even_idx]))
  new_order_vec
}

dat_II <- dat
new_order_II_1 <- quadratic_ordering(dat_II[,1])
new_order_II_3 <- quadratic_ordering(dat_II[,3])
new_order_II_5 <- quadratic_ordering(dat_II[,5])
new_order_II_7 <- quadratic_ordering(dat_II[,7])
new_order_II_9 <- quadratic_ordering(dat_II[,9])

dat_II_1 <- dat_II
dat_II_1[,c(1,2)] <- dat_II[order(dat[,1]), c(1,2)]

dat_II_3 <- dat_II
dat_II_3[,c(3,4)] <- dat_II[new_order_II_3, c(3,4)]

dat_II_5 <- dat_II
dat_II_5[,c(5,6)] <- dat_II[new_order_II_5, c(5,6)]

dat_II_7 <- dat_II
dat_II_7[,c(7,8)] <- dat_II[new_order_II_7, c(7,8)]

dat_II_9 <- dat_II
dat_II_9[,c(9,10)] <- dat_II[new_order_II_9, c(9,10)]

for (i in list(dat_II_1, dat_II_3, dat_II_5, dat_II_7, dat_II_9)){
  pairs(i)
}

Method 1.B

All we’re going to do is introduce the notion of “cell-types”/“clusters”

set.seed(10)
cluster_label <- sample(1:7, 100, replace = T)

dat4 <- dat
for(k in 1:max(cluster_label)){
  idx <- which(cluster_label == k)
  
  dat4[idx,c(1,2)] <- dat4[idx[order(dat4[idx,1])], c(1,2)]
  dat4[idx,c(7,8)] <- dat4[idx[order(dat4[idx,7])], c(7,8)]
}

pairs(dat4)

  1. Generating p/2 bi-variate datasets
  2. Concatenate all the p/2 together

New method 1. Generating p/2 bi-variate datasets 2. reshuffle rows for each dataset of p/2 2.1 order all the rows to be sorted in terms of respective first variable (x).